"""
# coding     : utf-8 
# Time       : 2025/6/11 10:06
# Author     : chenxianb
# version    : python 3.8.2
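# Description: Scrape the "时政要闻" (current-affairs news) listings from the first 10 pages of the Chongqing Jiaotong University news site and save them to an Excel file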
"""
import os.path
import time
from urllib.parse import urljoin

import openpyxl
import requests
from bs4 import BeautifulSoup


def get_html_text(url):
    """Download a page and return its HTML as text.

    Args:
        url: URL of the page to fetch.

    Returns:
        The decoded response body, or an empty string when the request
        fails (connection problem, timeout, or an HTTP error status).
    """
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:139.0) Gecko/20100101 Firefox/139.0"}
    try:
        r = requests.get(url, headers=header, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: callers receive "" when the page is unavailable.
        # Only request-related errors are swallowed, so Ctrl-C and genuine
        # programming errors still propagate.
        return ""
    # Decode with the encoding detected from the body (apparent_encoding).
    r.encoding = r.apparent_encoding
    return r.text


def get_pages(html):
    """Return the total number of pages read from a listing page.

    The count is taken from the element with id ``fanye204111``: its
    stripped text is split on "/" and the last piece is converted to int.

    Args:
        html: HTML text of the listing page.

    Returns:
        The total page count as an int.

    Raises:
        ValueError: If the pagination element is absent (for example when
            ``html`` is empty) or the text after the last "/" is not an
            integer.
    """
    soup = BeautifulSoup(html, "html.parser")
    pager = soup.find(id="fanye204111")
    if pager is None:
        # find() returns None when nothing matches; fail with a clear
        # message instead of an AttributeError on ``None.text``.
        raise ValueError('pagination element "fanye204111" not found in page')
    txt = pager.text.strip()
    return int(txt.split("/")[-1])


def get_news(html):
    """Extract the date, title and link of every news entry on a page.

    Titles and links are read from elements with class ``right-title``
    (the link is the ``href`` of the element's first ``<a>``); dates are
    read from ``<div class="time">`` elements. The lists are paired by
    position.

    Args:
        html: HTML text of a listing page. An empty string yields ``[]``.

    Returns:
        A list of ``[date, title, url]`` lists. If the page contains a
        different number of dates and titles, rows are truncated to the
        shorter of the two.
    """
    base_url = "https://news.cqjtu.edu.cn/"
    soup = BeautifulSoup(html, "html.parser")

    news_title = []
    news_url = []
    for item in soup.find_all(class_="right-title"):
        news_title.append(item.text.strip())
        # urljoin resolves "../" segments and leaves absolute hrefs intact,
        # which plain string concatenation would turn into malformed URLs.
        news_url.append(urljoin(base_url, item.a["href"]))

    news_date = [it.text.strip() for it in soup.find_all("div", "time")]

    # zip stops at the shortest list, so a count mismatch cannot raise
    # IndexError.
    return [list(row) for row in zip(news_date, news_title, news_url)]


def search_news(pages, max_pages=10):
    """Collect the news entries from the first ``max_pages`` listing pages.

    The first page is ``szyw.htm``; the following pages are requested as
    ``szyw/{n}.htm`` with ``n`` counting down from ``pages - 1``.

    Args:
        pages: Total number of listing pages (as returned by ``get_pages``).
        max_pages: Maximum number of pages to visit. Defaults to 10.

    Returns:
        A flat list with the ``[date, title, url]`` rows of every visited
        page, in visiting order. Pages that fail to download contribute
        no rows.
    """
    total = []
    # Clamp at 0 so that fewer than ``max_pages`` total pages never produces
    # requests for page numbers <= 0 (presumably numbered pages start at 1;
    # verify against the site).
    page_end = max(pages - max_pages, 0)
    for page in range(pages, page_end, -1):
        if page == pages:  # first page has its own URL
            url = "https://news.cqjtu.edu.cn/szyw.htm"
        else:
            url = f"https://news.cqjtu.edu.cn/szyw/{page}.htm"
        total.extend(get_news(get_html_text(url)))

    return total


def save_news(info, path):
    """Write the news rows to a new timestamped Excel file.

    The file is created as ``<path>/时政要闻<unix-timestamp>.xlsx`` and holds
    a single sheet named "news" with a header row followed by one row per
    entry of ``info``.

    Args:
        info: Iterable of rows, each a ``[date, title, url]`` sequence.
        path: Directory to write into; created (with parents) if missing.
    """
    # exist_ok avoids the check-then-create race and supports nested paths.
    os.makedirs(path, exist_ok=True)

    timestamp = int(time.time())
    file_path = os.path.join(path, f"时政要闻{timestamp}.xlsx")

    # Workbook's first parameter is the ``write_only`` flag, not a file
    # name. Request write-only mode explicitly so the workbook contains
    # only the sheet created below.
    wb = openpyxl.Workbook(write_only=True)
    try:
        table = wb.create_sheet("news", 0)
        table.append(["日期", "标题", "链接"])
        for item in info:
            table.append(item)
        wb.save(file_path)
    finally:
        wb.close()
    print("保存成功")


def main():
    """Fetch the news listing, gather the first pages and save to Excel."""
    url = "https://news.cqjtu.edu.cn/szyw.htm"
    path = "./news/"
    html = get_html_text(url)
    if not html:
        # get_html_text returns "" when the download fails; stop with a
        # clear message instead of crashing while parsing an empty page.
        raise SystemExit(f"获取页面失败: {url}")
    pages = get_pages(html)
    info = search_news(pages)
    save_news(info, path)


if __name__ == '__main__':
    main()
